# read the microarray files, normalize the expression data with fRMA,
# annotate and select probes, save the normalized expression values for each gene,
# the boolean state and the expression probability of genes


# library(GEOquery) # de-comment if you want to automatically download CEL files from GEO
library(affy)
library(frma)
source("../scripts/barcode-weights.r")

# Download and read CEL files
samples = read.table("samples.txt",header=TRUE,stringsAsFactors=FALSE)
# for(x in samples$dataset) getGEOSuppFiles(x,makeDirectory=FALSE)  # de-comment if you want to automatically download CEL files from GEO
b <- ReadAffy()

# Normalization with fRMA
if (b@annotation %in% c("pd.hugene.1.0.st.v1","hugene10stv1") ){
	library(oligo)
	system("gunzip *.CEL.gz")
	b = read.celfiles(list.celfiles())
	frmab = frma(b,target="core")
} else {
	frmab = frma(b)
}
M = exprs(frmab)

# Add gene symbol
anno = frmab@annotation
if(anno=='hgu133plus2'){
    library(hgu133plus2.db)
    map = select(hgu133plus2.db, rownames(M), c("SYMBOL","ENTREZID", "GENENAME"))
} else if(anno=='hgu133a'){
    library(hgu133a.db)
    map = select(hgu133a.db, rownames(M), c("SYMBOL","ENTREZID", "GENENAME"))
} else if(anno=='hgu133a2'){
    library(hgu133a2.db)
    map = select(hgu133a2.db, rownames(M), c("SYMBOL","ENTREZID", "GENENAME"))
} else if(anno=='mouse4302'){
	library(mouse4302.db)
	map1 = select(mouse4302.db, rownames(M), c("SYMBOL","ENTREZID", "GENENAME"))
	# find the human homologs to mouse genes
	library(biomaRt)
	mmart = useEnsembl("ensembl",dataset = "mmusculus_gene_ensembl")
	hmart =  useEnsembl("ensembl",dataset = "hsapiens_gene_ensembl")
	dic = getLDS(attributes = c("affy_mouse430_2"), filters = "affy_mouse430_2", values = map1$PROBEID, mart = mmart, attributesL = c("hgnc_symbol"), martL = hmart)
	map = cbind(PROBEID=dic$AFFY.Mouse430.2.probe,SYMBOL = dic$HGNC.symbol)
} else if(anno=='hugene.1.0.st.v1'){
	library(hugene10sttranscriptcluster.db)
	map = select(hugene10sttranscriptcluster.db, rownames(M), c("SYMBOL","ENTREZID", "GENENAME"))
} else if(anno=='mogene.1.0.st.v1'){
	library(mogene10sttranscriptcluster.db)
	map1 = select(mogene10sttranscriptcluster.db, rownames(M), c("SYMBOL","ENTREZID", "GENENAME"))
	library(biomaRt)
	mmart = useEnsembl("ensembl",dataset = "mmusculus_gene_ensembl")
	hmart =  useEnsembl("ensembl",dataset = "hsapiens_gene_ensembl")
	dic = getLDS(attributes = c("affy_mogene_1_0_st_v1"), filters = "affy_mogene_1_0_st_v1", values = map1$PROBEID, mart = mmart, attributesL = c("hgnc_symbol"), martL = hmart)
	map = cbind(PROBEID=dic$AFFY.MoGene.1.0.st.v1.probe,SYMBOL = dic$HGNC.symbol)
}

colnames(M) = samples$dataset[pmatch(samples$dataset,colnames(M))]
rownames(M) = map[match(rownames(M), map[,"PROBEID"]),'SYMBOL']
M = M[!is.na(rownames(M)),]

# select one probe for each gene based on variance
variance = apply(M,1,var,na.rm=TRUE)
variance.aggregated.index =tapply(seq_along(variance), names(variance), function(x){
  m = max(variance[x])
  I = which(variance[x]==m)
  x[I[1]]
})
M = M[variance.aggregated.index,]
write.table(M, "data_fRMA_normalized.csv",sep="\t",row.names=TRUE,quote=FALSE)

# reload
M = read.csv("data_fRMA_normalized.csv",sep="\t",header=TRUE,stringsAsFactors=FALSE,quote="",check.names=FALSE)
colnames(M) = samples$class[match(colnames(M),samples$dataset)]

###############
# expr values #
###############
M_a = M[,which(colnames(M)=="control"),drop=FALSE]
M_b = M[,which(colnames(M)=="treated"),drop=FALSE]
exprs = cbind(apply(M_a,1,mean),apply(M_b,1,mean)) #assign the mean value across replicates
write.table(exprs,file="genes_expr_values.txt",col.names=FALSE,row.names=TRUE,sep="\t",quote=FALSE)

##################
# boolean values #
##################
bool = barcode(frmab,output="p-value")
bool_a = bool[,which(colnames(M)=="control"),drop=FALSE]
bool_b = bool[,which(colnames(M)=="treated"),drop=FALSE]
bool = cbind(apply(bool_a,1,min),apply(bool_b,1,min)) # min p-value across replicates

rownames(bool) = map[match(rownames(bool), map[,"PROBEID"]),'SYMBOL']
bool_ag = aggregate(bool,list(rownames(bool)),min) # min p-value across probes
write.table(bool_ag,"genes_pval.txt",col.names=FALSE,row.names=FALSE,sep="\t",quote=FALSE)

#################
# p(expression) #
#################
p = barcode_weights(frmab) # see barcode-weights.r
p_a = p[,which(colnames(M)=="control"),drop=FALSE]
p_b = p[,which(colnames(M)=="treated"),drop=FALSE]
p = cbind(apply(p_a,1,max),apply(p_b,1,max)) # max p(expression) across replicates

rownames(p) = map[match(rownames(p), map[,"PROBEID"]),'SYMBOL']
p_ag = aggregate(p,list(rownames(p)),max) # max p(expression) across probes
write.table(p_ag,"genes_weights.txt",col.names=FALSE,row.names=FALSE,sep="\t",quote=FALSE)